1 基于iris数据的示例

1.1 报告描述性统计量

- User提问: 在R中报告iris中各个变量的均值、中位数、标准差、最小值和最大值。

# 加载iris数据集
data(iris)

# 计算均值
mean_values <- round(colMeans(iris[, 1:4]), 3)

# 计算中位数
median_values <- sapply(iris[, 1:4], median)

# 计算标准差
sd_values <- round(sapply(iris[, 1:4], sd), 3)

# 计算最小值
min_values <- sapply(iris[, 1:4], min)

# 计算最大值
max_values <- sapply(iris[, 1:4], max)

# 创建一个数据框包含统计量
statistics <- data.frame(
  Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
  Mean = mean_values,
  Median = median_values,
  SD = sd_values,
  Min = min_values,
  Max = max_values
)

# 打印报告
print(statistics)
                 Variable  Mean Median    SD Min Max
Sepal.Length Sepal.Length 5.843   5.80 0.828 4.3 7.9
Sepal.Width   Sepal.Width 3.057   3.00 0.436 2.0 4.4
Petal.Length Petal.Length 3.758   4.35 1.765 1.0 6.9
Petal.Width   Petal.Width 1.199   1.30 0.762 0.1 2.5

- User: 结果中最小值和最大值都只有1位小数,如何修改?

# 加载iris数据集
data(iris)

# 计算均值
mean_values <- format(round(colMeans(iris[, 1:4]), 3), nsmall = 3)

# 计算中位数
median_values <- format(round(sapply(iris[, 1:4], median), 3), nsmall = 3)

# 计算标准差
sd_values <- format(round(sapply(iris[, 1:4], sd), 3), nsmall = 3)

# 计算最小值
min_values <- format(round(sapply(iris[, 1:4], min), 3), nsmall = 3)

# 计算最大值
max_values <- format(round(sapply(iris[, 1:4], max), 3), nsmall = 3)

# 创建一个数据框包含统计量
statistics <- data.frame(
  Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
  Mean = mean_values,
  Median = median_values,
  SD = sd_values,
  Min = min_values,
  Max = max_values
)

# 移除Variable列名
rownames(statistics) <- NULL

# 打印报告
print(statistics)
      Variable  Mean Median    SD   Min   Max
1 Sepal.Length 5.843  5.800 0.828 4.300 7.900
2  Sepal.Width 3.057  3.000 0.436 2.000 4.400
3 Petal.Length 3.758  4.350 1.765 1.000 6.900
4  Petal.Width 1.199  1.300 0.762 0.100 2.500

- User: 如何将数据框导出到word文档?

library(officer)

# 创建一个空白的Word文档
doc <- read_docx()

# 添加数据框
table <- as.data.frame(statistics)
doc <- body_add_table(doc, table)

# 保存Word文档
print(doc, target = "iris_table.docx")
描述性统计分析导出到Word文档
描述性统计分析导出到Word文档

- User: 按Species分组,报告各组均值,结果保留3位小数?

# 加载iris数据集
data(iris)

# 按Species分组计算均值,并保留3位小数
group_statistics <- aggregate(. ~ Species, data = iris, FUN = function(x) {
  round(mean(x), 3)
})

# 打印分组均值
print(group_statistics)
     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1     setosa        5.006       3.428        1.462       0.246
2 versicolor        5.936       2.770        4.260       1.326
3  virginica        6.588       2.974        5.552       2.026

1.2 绘制分组箱线图

- User提问: 按Species分组,绘制各变量的箱线图。

# 加载iris数据集
data(iris)

# 按Species分组绘制箱线图
par(mfrow = c(2, 2))  # 设置图形布局为2行2列

# 绘制Sepal.Length的箱线图
boxplot(Sepal.Length ~ Species, data = iris, 
        main = "Sepal Length", xlab = "Species", ylab = "Sepal Length")

# 绘制Sepal.Width的箱线图
boxplot(Sepal.Width ~ Species, data = iris, main = "Sepal Width", xlab = "Species", ylab = "Sepal Width")

# 绘制Petal.Length的箱线图
boxplot(Petal.Length ~ Species, data = iris, main = "Petal Length", xlab = "Species", ylab = "Petal Length")

# 绘制Petal.Width的箱线图
boxplot(Petal.Width ~ Species, data = iris, main = "Petal Width", xlab = "Species", ylab = "Petal Width")

- User提问: 用ggplot2绘制按Species分组后,各变量的箱线图。

# 加载所需的库
library(ggplot2)

# 加载iris数据集
data(iris)

# 按Species分组绘制箱线图
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_boxplot() +
  labs(title = "Sepal Length", x = "Species", y = "Sepal Length")

ggplot(iris, aes(x = Species, y = Sepal.Width, fill = Species)) +
  geom_boxplot() +
  labs(title = "Sepal Width", x = "Species", y = "Sepal Width")

ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  geom_boxplot() +
  labs(title = "Petal Length", x = "Species", y = "Petal Length")

ggplot(iris, aes(x = Species, y = Petal.Width, fill = Species)) +
  geom_boxplot() +
  labs(title = "Petal Width", x = "Species", y = "Petal Width")

- User提问:让上述4个图形显示在同一窗口。

# 加载所需的库
library(ggplot2)

# 加载iris数据集
data(iris)

# 创建绘图对象,按Species分组绘制箱线图
p <- ggplot(iris, aes(x = Species, fill = Species)) +
  labs(x = "Species") +
  theme_minimal() +
  theme(legend.position = "none")

# 添加Sepal.Length的箱线图
p1 <- p +
  geom_boxplot(aes(y = Sepal.Length)) +
  labs(title = "Sepal Length")

# 添加Sepal.Width的箱线图
p2 <- p +
  geom_boxplot(aes(y = Sepal.Width)) +
  labs(title = "Sepal Width")

# 添加Petal.Length的箱线图
p3 <- p +
  geom_boxplot(aes(y = Petal.Length)) +
  labs(title = "Petal Length")

# 添加Petal.Width的箱线图
p4 <- p +
  geom_boxplot(aes(y = Petal.Width)) +
  labs(title = "Petal Width")

# 在同一窗口中显示四个箱线图
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)

- User提问: Error in loadNamespace(x) : there is no package called ‘gridExtra’

- User提问: 用ggpubr绘制iris的分组箱线图

library(ggplot2)
library(ggpubr)

# 加载数据集
data("iris")

# 绘制分组箱线图
ggboxplot(iris, x = "Species", y = "Sepal.Length", 
          color = "Species", palette = "jco",
          add = "jitter", 
          ylab = "Sepal Length", xlab = "Species",
          title = "Boxplot of Sepal Length by Species")

- User提问: 如何在图中标注两个总体均值的t检验的结果?

# 加载所需包
library(ggpubr)

# 载入iris数据集
data(iris)

# 绘制分组箱线图并标注t检验结果
ggboxplot(iris, x = "Species", y = "Sepal.Length", 
          color = "Species", palette = "jco",
          ylab = "Sepal Length", xlab = "Species",
          add = "jitter") +
  stat_compare_means(comparisons = list(c("setosa", "versicolor"),
                                        c("setosa", "virginica"),
                                        c("versicolor", "virginica")),
                     method = "t.test",
                     label = "p.format",
                     hide.ns = TRUE)

- User提问: 如何标注p值 ?

# 加载所需的库
library(ggpubr)

# 绘制分组箱线图并添加显著性标记
ggboxplot(iris, x = "Species", y = "Sepal.Length",
          color = "Species", palette = "jco",
          ylab = "Sepal Length", xlab = "Species",
          add = "mean", title = "Iris Sepal Length by Species") +
  stat_compare_means(comparisons = list(c("setosa", "versicolor"),
                                       c("setosa", "virginica"),
                                       c("versicolor", "virginica")),
                     method = "t.test", label = "p.signif") +
  theme_pubr()

1.3 方差分析

- User提问: 将iris按照Species分组,检验三个组别的Sepal.length的均值是否相等?

# 加载所需的库
library(dplyr)
library(ggplot2)

# 加载iris数据集
data(iris)

# 进行方差分析
anova_result <- aov(Sepal.Length ~ Species, data = iris)

# 检验均值是否相等
anova_summary <- summary(anova_result)

# 提取p-value
p_value <- anova_summary[[1]]$"Pr(>F)"[1]

# 输出方差分析结果
print(anova_summary)
             Df Sum Sq Mean Sq F value Pr(>F)    
Species       2  63.21  31.606   119.3 <2e-16 ***
Residuals   147  38.96   0.265                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n")
cat("p-value:", p_value)
p-value: 1.669669e-31

- User提问: 该数据满足ANOVA检验的适用条件吗?

- User提问: iris数据集每个组别内的Sepal.Length观测值是否近似正态分布?

# 加载所需的库
library(dplyr)
library(ggplot2)

# 加载iris数据集
data(iris)

# 按Species分组,进行正态性检验
normality_test <- iris %>%
  group_by(Species) %>%
  summarise(p_value = shapiro.test(Sepal.Length)$p.value)

# 输出正态性检验结果
print(normality_test)
# A tibble: 3 × 2
  Species    p_value
  <fct>        <dbl>
1 setosa       0.460
2 versicolor   0.465
3 virginica    0.258

- User提问: R 中进行方差齐性检验的函数是什么?

- User提问: bartlett.test()函数的结果如何看?

bartlett.test(iris$Sepal.Length,iris$Species)

    Bartlett test of homogeneity of variances

data:  iris$Sepal.Length and iris$Species
Bartlett's K-squared = 16.006, df = 2, p-value = 0.0003345

- User提问: iris数据集不同组别之间的Sepal.Length观测值方差不等,还能用ANOVA检验吗?

- User提问: Welch’s ANOVA的命令是什么?

# 导入iris数据集
data(iris)

# 执行Welch's ANOVA
result <- oneway.test(Sepal.Length ~ Species, data = iris)

# 打印结果
print(result)

    One-way analysis of means (not assuming equal variances)

data:  Sepal.Length and Species
F = 138.91, num df = 2.000, denom df = 92.211, p-value < 2.2e-16

1.4 判别分析

- User提问:基于iris数据,建立Species的判别函数?

- User提问: could not find function “lda”

# Install and load the MASS package
#install.packages("MASS")
library(MASS)

# Load the iris dataset
data(iris)

# Perform LDA
lda_model <- lda(Species ~ ., data = iris)

# Print the discriminant function results
print(lda_model)
Call:
lda(Species ~ ., data = iris)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa            5.006       3.428        1.462       0.246
versicolor        5.936       2.770        4.260       1.326
virginica         6.588       2.974        5.552       2.026

Coefficients of linear discriminants:
                    LD1         LD2
Sepal.Length  0.8293776 -0.02410215
Sepal.Width   1.5344731 -2.16452123
Petal.Length -2.2012117  0.93192121
Petal.Width  -2.8104603 -2.83918785

Proportion of trace:
   LD1    LD2 
0.9912 0.0088 

- User提问:输出结果如何解读?

- User提问:如何利用判别函数判定一朵鸢尾花属于哪个组别?

# 用于预测的鸢尾花特征向量
new_iris <- data.frame(Sepal.Length = 5.1, Sepal.Width = 3.5, Petal.Length = 1.4, Petal.Width = 0.2)

# 使用判别函数进行预测
predicted_group <- predict(lda_model, newdata = new_iris)

# 输出预测结果
print(predicted_group$class)
[1] setosa
Levels: setosa versicolor virginica

1.5 向ChatGPT提问技巧

  • 如何在R中实现XX计算/检验/方法?请用R自带的数据集举例说明。

  • 用XX包在R中实现XX计算/检验?请用R自带的数据集举例说明。

    • 指定包的名称,例如ggplot2
  • XX计算/检验/方法的适用条件是什么?

  • 在R中如何考察XX方法/检验的适用条件是否满足?

  • 如何解读XX函数的输出结果的含义?

  • R中的XX分析有哪些可视化工具?

2 如何借助ChatGPT开展统计分析?

2.1 描述数据结构和分析目标

- User提问: 我有实验组和对照组各10人的血压数据,如何判断两个组别的血压是否有显著差距?

- User提问: 我的两个组别分别只有10人,能用独立样本t检验吗?

2.2 统计方法的选择:样本容量、数据类型、分布特征

- User提问: 还有别的方法吗?

- User提问: Wilcoxon Signed-Rank Test的R代码是什么?

- User提问:我有实验组和对照组各10人的跟踪观测服药7天,14天和21天的血压数据,可以开展什么统计分析,来判断药物是否有效?

2.3 请求chatGPT创建模拟数据

- User提问:请在R中生成上述模拟数据,及实现Repeated Measures ANOVA的代码?

# 生成模拟数据
set.seed(123)  # 设置随机种子以保证结果的可重复性

# 创建数据框
data <- data.frame(
  Subject = rep(1:10, each = 3),  # 受试者编号,重复测量所以有10个受试者
  Time = rep(c("Day 7", "Day 14", "Day 21"), times = 10),  # 时间点
  Group = rep(c("Control", "Treatment"), each = 15),  # 组别
  BloodPressure = rnorm(30, mean = c(120, 130, 125), sd = c(5, 7, 6))  # 血压数据
)

# 查看数据
print(data)
   Subject   Time     Group BloodPressure
1        1  Day 7   Control      117.1976
2        1 Day 14   Control      128.3888
3        1 Day 21   Control      134.3522
4        2  Day 7   Control      120.3525
5        2 Day 14   Control      130.9050
6        2 Day 21   Control      135.2904
7        3  Day 7   Control      122.3046
8        3 Day 14   Control      121.1446
9        3 Day 21   Control      120.8789
10       4  Day 7   Control      117.7717
11       4 Day 14   Control      138.5686
12       4 Day 21   Control      127.1589
13       5  Day 7   Control      122.0039
14       5 Day 14   Control      130.7748
15       5 Day 21   Control      121.6650
16       6  Day 7 Treatment      128.9346
17       6 Day 14 Treatment      133.4850
18       6 Day 21 Treatment      113.2003
19       7  Day 7 Treatment      123.5068
20       7 Day 14 Treatment      126.6905
21       7 Day 21 Treatment      118.5931
22       8  Day 7 Treatment      118.9101
23       8 Day 14 Treatment      122.8180
24       8 Day 21 Treatment      120.6267
25       9  Day 7 Treatment      116.8748
26       9 Day 14 Treatment      118.1931
27       9 Day 21 Treatment      130.0267
28      10  Day 7 Treatment      120.7669
29      10 Day 14 Treatment      122.0330
30      10 Day 21 Treatment      132.5229
# 进行重复测量方差分析(Repeated Measures ANOVA)
library(nlme)

# 设置数据为重复测量设计
data$Subject <- factor(data$Subject)
data$Time <- factor(data$Time)

# 执行重复测量方差分析
model <- lme(BloodPressure ~ Time * Group, data = data, random = ~ 1 | Subject)
anova_result <- anova(model)

# 打印重复测量方差分析结果
print(anova_result)
            numDF denDF   F-value p-value
(Intercept)     1    16 13201.338  <.0001
Time            2    16     3.112  0.0721
Group           1     8     1.635  0.2369
Time:Group      2    16     1.151  0.3412

2.4 解读输出结果

- User提问: 解释结果 > # 打印重复测量方差分析结果

print(anova_result)

        numDF denDF   F-value p-value
        

(Intercept) 1 16 13201.338 <.0001

Time 2 16 3.112 0.0721

Group 1 8 1.635 0.2369

Time:Group 2 16 1.151 0.3412

3 如何学习ChatGPT生成的代码?

3.1 请ChatGPT给代码添加注释

3.2 请chatGPT解释函数的用法、参数项

- User提问:解释lme函数的用法?

- User提问:fixed formula如何设定

- User提问:random = ~ 1 |subject代表什么

3.3 代码报错的应对

3.4 如何模仿ChatGPT生成的代码?

- User提问:使用支持向量机的方法来对iris数据进行分析,报告测试集中的个案的预测组别、实际组别,以及预测是否正确,预测的准确率

# 导入所需的包
library(e1071)
library(dplyr)

# 划分训练集和测试集
set.seed(123)
train_indices <- sample(1:nrow(iris), nrow(iris) * 0.7)
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# 创建SVM模型
svm_model <- svm(Species ~ ., data = train_data)

# 预测测试集
predicted_labels <- predict(svm_model, test_data)

# 计算预测准确率
accuracy <- sum(predicted_labels == test_data$Species) / nrow(test_data)
cat("预测准确率:", accuracy, "\n")
预测准确率: 0.9777778 
# 创建结果列表
result_list <- data.frame(
  Individual = 1:nrow(test_data),
  Actual = test_data$Species,
  Predicted = predicted_labels,
  Correct = ifelse(test_data$Species == predicted_labels, "Correct", "Incorrect")
)

# 打印结果列表
print(result_list)
    Individual     Actual  Predicted   Correct
1            1     setosa     setosa   Correct
2            2     setosa     setosa   Correct
3            3     setosa     setosa   Correct
5            4     setosa     setosa   Correct
11           5     setosa     setosa   Correct
18           6     setosa     setosa   Correct
19           7     setosa     setosa   Correct
28           8     setosa     setosa   Correct
29           9     setosa     setosa   Correct
33          10     setosa     setosa   Correct
36          11     setosa     setosa   Correct
45          12     setosa     setosa   Correct
48          13     setosa     setosa   Correct
49          14     setosa     setosa   Correct
55          15 versicolor versicolor   Correct
56          16 versicolor versicolor   Correct
57          17 versicolor versicolor   Correct
58          18 versicolor versicolor   Correct
59          19 versicolor versicolor   Correct
61          20 versicolor versicolor   Correct
62          21 versicolor versicolor   Correct
65          22 versicolor versicolor   Correct
66          23 versicolor versicolor   Correct
68          24 versicolor versicolor   Correct
70          25 versicolor versicolor   Correct
77          26 versicolor versicolor   Correct
83          27 versicolor versicolor   Correct
84          28 versicolor  virginica Incorrect
94          29 versicolor versicolor   Correct
95          30 versicolor versicolor   Correct
98          31 versicolor versicolor   Correct
100         32 versicolor versicolor   Correct
101         33  virginica  virginica   Correct
104         34  virginica  virginica   Correct
105         35  virginica  virginica   Correct
111         36  virginica  virginica   Correct
113         37  virginica  virginica   Correct
116         38  virginica  virginica   Correct
125         39  virginica  virginica   Correct
131         40  virginica  virginica   Correct
133         41  virginica  virginica   Correct
135         42  virginica  virginica   Correct
140         43  virginica  virginica   Correct
141         44  virginica  virginica   Correct
145         45  virginica  virginica   Correct

- User提问:创建500个糖尿病患者的模拟数据,包括患者的年龄、BMI、血糖、胰岛素、腰围、锻炼、饮食、饮酒和吸烟习惯,糖尿病类型(1型、2型和妊娠性)。

- User提问:解释上述数据的含义。

set.seed(123)  # 设置随机种子以确保可重复性

# 生成500个糖尿病患者数据
diabetes_data <- data.frame(
  Age = round(rnorm(500, mean = 50, sd = 10)),
  BMI = round(rnorm(500, mean = 28, sd = 5)),
  Glucose = round(rnorm(500, mean = 120, sd = 30)),
  Insulin = round(rnorm(500, mean = 10, sd = 5)),
  Exercise = sample(c("None", "Light", "Moderate", "Heavy"), 500, replace = TRUE),
  Diet = sample(c("Healthy", "Unhealthy"), 500, replace = TRUE),
  Waist = round(rnorm(500, mean = 90, sd = 10)),
  Alcohol = sample(c("No", "Occasional", "Regular"), 500, replace = TRUE),
  Smoking = sample(c("No", "Occasional", "Regular"), 500, replace = TRUE),
  Type = sample(c("Type 1", "Type 2", "Gestational"), 500, replace = TRUE)
)

# 查看更新后的数据集
head(diabetes_data)
  Age BMI Glucose Insulin Exercise      Diet Waist    Alcohol    Smoking
1  44  25      90       6    Light   Healthy    83         No    Regular
2  48  23      89       8    Heavy   Healthy    96 Occasional    Regular
3  66  33     119       5    Light Unhealthy    83         No    Regular
4  51  32     116      13     None   Healthy    85 Occasional    Regular
5  51  20      44      16 Moderate Unhealthy    98 Occasional         No
6  67  28     151      21 Moderate   Healthy    85 Occasional Occasional
         Type
1      Type 2
2 Gestational
3      Type 1
4      Type 2
5 Gestational
6 Gestational
# 划分训练集和测试集
set.seed(123)
train_indices <- sample(1:nrow(diabetes_data), nrow(diabetes_data) * 0.7)
train_data <- diabetes_data[train_indices, ]
test_data <- diabetes_data[-train_indices, ]


# 将目标变量转换为因子类型
train_data$Type <- as.factor(train_data$Type)
test_data$Type <- as.factor(test_data$Type)

# 创建并训练支持向量机模型
svm_model <- svm(Type ~ ., data = train_data, kernel = "radial", cost = 1)

# 在测试集上进行预测
predicted_labels <- predict(svm_model, newdata = test_data)

# 计算预测准确率
accuracy <- sum(predicted_labels == test_data$Type) / nrow(test_data) * 100
cat("预测准确率:", round(accuracy, 2), "%\n")
预测准确率: 34 %
# 创建包含预测结果的表格
prediction_table <- data.frame(
  Actual = test_data$Type,
  Predicted = predicted_labels,
  Correct = ifelse(test_data$Type == predicted_labels, "Yes", "No")
)

# 查看预测结果表格
head(prediction_table)
        Actual   Predicted Correct
1       Type 2 Gestational      No
3       Type 1 Gestational      No
6  Gestational Gestational     Yes
8       Type 2 Gestational      No
9       Type 1 Gestational      No
12 Gestational      Type 1      No